/*

	This file cleans the Danish Prodcom register and consists of 2 sections:
	1)	Impute missing firm identifiers (jurnr) from alternative identifiers (rapport_id and saflbnr).
		This exercise is based on advice from DST.
	2)	Collape data to firm-product-year level, where "product" is a CN8+ times unit of measurement pair

*/

cd ${work}2_prepare

********************************************************************************
*** SECTION 1: IMPUTE MISSING JURNR

*FIRST, USE RAPID. DATA BREAK 2006-2007, SO DON'T IMPUTE "ACROSS" THESE PERIODS
foreach n of numlist 1/2 {
	
use input\vars_ipt, clear

if `n'==1 keep if year<=2006
if `n'==2 keep if year>=2007

keep jurnr rapid year date
keep if real(rapid)!=.
duplicates drop		//UNIQUE IN RAPID-DATE
gen mis=(jurnr=="")
sort rapid year date jurnr mis
order rapid jurnr year date mis

by rapid: egen min_mis=min(mis)
by rapid: egen max_mis=max(mis)
keep if min_mis!=max_mis
drop *_mis

*"SPELL"
by rapid: gen temp=1 if _n==1 | mis!=mis[_n-1] | jurnr!=jurnr[_n-1]
sort temp rapid date
by temp: gen spell=_n if temp==1
sort rapid date
replace spell=spell[_n-1] if spell==. & spell[_n-1]!=.
sort spell date
drop temp

*START IMPUTING INITIAL MISSINGS
gen init_mis=(mis==1 & (rapid!=rapid[_n-1] | _n==1))
by spell: gen no_mis=_N
replace jurnr=jurnr[_n+no_mis] if init_mis==1
drop init_mis no_mis mis

*THEN THE REST
replace jurnr=jurnr[_n-1] if jurnr=="" & jurnr[_n-1]!="" & rapid==rapid[_n-1]

*SAVE
rename jurnr jurnr_rapid_`n'
keep rapid jurnr_rapid_`n' date
duplicates drop
save temp\imputed_rapid_`n', replace

}

**THEN, SUPPLEMENTARY, USE SAFLBNR
foreach n of numlist 1/2 {
	
use input\vars_ipt, clear

keep jurnr saflbnr year date
keep if real(saflbnr)!=.
duplicates drop		//ALMOST(!) UNIQUE IN SAFLBNR-DATE

if `n'==1 keep if year<=2006
if `n'==2 keep if year>=2007

duplicates tag saflbnr date, gen(dup0)
bysort saflbnr: egen dup=max(dup0)
keep if dup==0
drop dup0 dup

gen mis=(jurnr=="")
sort saflbnr year date jurnr mis
order saflbnr jurnr year date mis

by saflbnr: egen min_mis=min(mis)
by saflbnr: egen max_mis=max(mis)
keep if min_mis!=max_mis
drop *_mis

*"SPELL"
by saflbnr: gen temp=1 if _n==1 | mis!=mis[_n-1] | jurnr!=jurnr[_n-1]
sort temp saflbnr date
by temp: gen spell=_n if temp==1
sort saflbnr date
replace spell=spell[_n-1] if spell==. & spell[_n-1]!=.
sort spell date
drop temp

*START IMPUTING INITIAL MISSINGS
gen init_mis=(mis==1 & (saflbnr!=saflbnr[_n-1] | _n==1))
by spell: gen no_mis=_N
replace jurnr=jurnr[_n+no_mis] if init_mis==1
drop init_mis no_mis mis

*THEN THE REST
replace jurnr=jurnr[_n-1] if jurnr=="" & jurnr[_n-1]!="" & saflbnr==saflbnr[_n-1]

*SAVE
rename jurnr jurnr_saflbnr_`n'
keep saflbnr jurnr_saflbnr_`n' date
duplicates drop
save temp\imputed_saflbnr_`n', replace

}

**MERGE ON AND IMPUTE
use input\vars_ipt, clear

merge m:1 rapid date using temp\imputed_rapid_1, assert(1 3) nogen
merge m:1 rapid date using temp\imputed_rapid_2, assert(1 3) nogen
merge m:1 saflbnr date using temp\imputed_saflbnr_1, assert(1 3) nogen
merge m:1 saflbnr date using temp\imputed_saflbnr_2, assert(1 3) nogen
foreach x in imputed_rapid_1 imputed_rapid_2 imputed_saflbnr_1 imputed_saflbnr_2 {
	erase temp/`x'.dta
}

count if jurnr_rapid_1!=jurnr_saflbnr_1 & jurnr_rapid_1!="" & jurnr_saflbnr_1!=""
count if jurnr_rapid_2!=jurnr_saflbnr_2 & jurnr_rapid_2!="" & jurnr_saflbnr_2!=""
*FEW! REMOVE THESE ENTIRELY
gen out=((jurnr_rapid_1!=jurnr_saflbnr_1 & jurnr_rapid_1!="" & jurnr_saflbnr_1!="") | jurnr_rapid_2!=jurnr_saflbnr_2 & jurnr_rapid_2!="" & jurnr_saflbnr_2!="")
bysort rapid: egen out_rap=max(out)
bysort saflbnr: egen out_saf=max(out)
keep if out_rap==0 & out_saf==0
drop out*

foreach v of varlist jurnr_rapid_1 jurnr_saflbnr_1 jurnr_rapid_2 jurnr_saflbnr_2 {
	replace jurnr = `v' if jurnr=="" & `v'!=""
	}
drop jurnr_* saflbnr rapid

**DROP FIRMS WITH MISSING JURNR EVEN AFTER IMPUTATION
keep if jurnr!=""
	

	
********************************************************************************
*** SECTION 2: COLLAPSE DATA TO FIRM-PRODUCT-YEAR
	
*DROP IRRELEVANT PRODUCT CODES AND YEARS
keep if vnr>100 & year>=2002

*DEFINE FIRM-YEAR LEVEL INDUSTRY
preserve
collapse (sum) vrd, by(jurnr year db07)
gsort jurnr year -vrd
drop vrd
duplicates drop jurnr year, force
save temp\temp, replace
restore

*COLLAPSE TO YEARLY LEVEL
gen vrd_mgd=vrd if mgd!=.
collapse (sum) vrd vrd_mgd mgd, by(jurnr vnr mgd_tekst year) fast

merge m:1 jurnr year using temp\temp, assert(3) nogen
erase temp\temp.dta

*SHARE OF COVERED VALUE, FIRM-YEAR LEVEL
bysort jurnr year: egen vrd_tot=total(vrd)
bysort jurnr year: egen vrd_mgd_tot=total(vrd_mgd)
gen share=vrd_mgd_tot/vrd_tot
drop vrd_mgd *_tot

*EXCLUDE OBSERVATIONS WITH MISSING/ZERO QUANTITY
keep if mgd!=. & mgd!=0

*CONCORDANCE
gen double cn8=floor(vnr/100)
merge m:1 cn8 year using input\cn8_cn8plus_2002_2015, keepusing(cn8plus) keep(matched) nogen

*WE MUST ASSIGN A CN6 CATEGORY TO ALL CN8+ CODES (SOME OF THEM COLLECT CN8 CODES THAT OVERLAP CN6 CATEGORIES)
gen double cn6=floor(cn8/100)
bysort cn8plus cn6: egen vrd_cn6=total(vrd)
bysort cn8plus: egen topvrd_cn6=max(vrd_cn6)
gen top_cn6_temp=cn6 if vrd_cn6==topvrd_cn6
bysort cn8plus: egen top_cn6=mean(top_cn6_temp)
drop vnr cn8 vrd_cn6 topvrd_cn6 top_cn6_temp cn6
rename cn8plus vnr
rename top_cn6 vnr6
collapse (sum) vrd mgd , by(jurnr vnr vnr6 mgd_tekst year share db07) fast
	
*SAVE
compress
save temp\vars_ipt, replace		